The workflow is divided into two parts: 1) Data preparation and QC 2) Differential Expression analysis.

In Part 1, GeoMX data processing and QC uses the R package GeoMXWorkflows following the vignette: https://bioconductor.org/packages/3.14/workflows/vignettes/GeoMxWorkflows/inst/doc/GeomxTools_RNA-NGS_Analysis.html

In Part 2, the gene-level count data from GeoMXWorkflows is used as input to the R package DESeq2 for differential expression analysis.

1. Data processing and QC

1.1 Load files from GeoMX platform

dcc, pkc, and sample annotation (xlsx) files

#Read in data files
datadir <- "/orange/timgarrett/GE5701-MThompson-S1-HW525DRXY-Lane1-327318995/GeoMx_NGS_Pipeline_02_04_2022_5_05_33-523906740/GeoMx_NGS_Pipeline_02_04_2022_5_05_33-ds.9555984bb7004944a9f4a6ff1ff8c256/"
DCCFiles <- dir(datadir, pattern=".dcc$", full.names=TRUE)
PKCFiles <- "Hs_R_NGS_WTA_v1.0.pkc"
SampleAnnotationFile <- "CThompsonAnnotations.xlsx"

#Create the GeoMX object
ThompsonData <-readNanoStringGeoMxSet(dccFiles = DCCFiles,
                                      pkcFiles = PKCFiles,
                                      phenoDataFile = SampleAnnotationFile,
                                      phenoDataSheet = "Annotations",
                                      phenoDataDccColName = "Sample_ID",
                                      protocolDataColNames = c("aoi","roi"),
                                      experimentDataColNames = c("panel"))
## Warning in readNanoStringGeoMxSet(dccFiles = DCCFiles, pkcFiles =
## PKCFiles, : DCC files missing for the following: DSP-1001660006002-A-F08.dcc,
## DSP-1001660006002-A-F05.dcc, These will be excluded from the GeoMxSet object.
pkcs <- annotation(ThompsonData)
modules <- gsub(".pkc", "", pkcs)
kable(data.frame(PKCs = pkcs, modules = modules))
PKCs modules
Hs_R_NGS_WTA_v1.0.pkc Hs_R_NGS_WTA_v1.0

1.2 Sample Overview

Prepare annotation data for sample overview and QC

# select the annotations we want to show, use `` to surround column names with
# spaces or special symbols
svarLabels(ThompsonData)
##  [1] "slide name"        "Case"              "Slice"            
##  [4] "scan name"         "segment"           "area"             
##  [7] "organ_region"      "slice_region"      "compartment"      
## [10] "FileVersion"       "SoftwareVersion"   "Date"             
## [13] "SampleID"          "Plate_ID"          "Well"             
## [16] "SeqSetId"          "Raw"               "Trimmed"          
## [19] "Stitched"          "Aligned"           "umiQ30"           
## [22] "rtsQ30"            "DeduplicatedReads" "roi"              
## [25] "aoi"               "NTC_ID"            "NTC"
count_mat <- dplyr::count(pData(ThompsonData), `slide name`, organ_region, compartment, segment)

# simplify the slide names
count_mat$`slide name` <- gsub(" WTA 12-15-21", "",count_mat$`slide name`)

# gather the data and plot in order: class, slide name, region, segment
test_gr <- gather_set_data(count_mat, 1:4)
test_gr$x <- factor(test_gr$x,
                    levels = c("compartment","slide name","organ_region","segment"))

Make a chart of our samples before QC filtering (this helps visually track what, if any, samples are removed by QC steps)

# plot Sankey. annotate yend and hjust will depend on total samples
ggplot(test_gr, aes(x, id = id, split = y, value = n)) +
  geom_parallel_sets(aes(fill = compartment), alpha = 0.5, axis.width = 0.1) +
  geom_parallel_sets_axes(axis.width = 0.2) +
  geom_parallel_sets_labels(color = "white", size = 3) +
  theme_classic(base_size = 17) + 
  theme(legend.position = "bottom",
        axis.ticks.y = element_blank(),
        axis.line = element_blank(),
        axis.text.y = element_blank()) +
  scale_y_continuous(expand = expansion(0)) + 
  scale_x_discrete(expand = expansion(0)) +
  labs(x = "", y = "") +
  annotate(geom = "segment", x = 4.25, xend = 4.25,
           y = 1, yend = 92, lwd = 2) +
  annotate(geom = "text", x = 4.19, y = 70, angle = 90, size = 5,
           hjust = 1.3, label = "83 samples")

1.3 Per-sample quality Control

Set QC parameters that will be used for sample QC

# Shift counts one
ThompsonData <- shiftCountsOne(ThompsonData, useDALogic = TRUE)

##Select Segment QC
# Default QC cutoffs are commented in () adjacent to the respective parameters
# study-specific values were selected after visualizing the QC results in more
# detail below
QC_params <-
  list(minSegmentReads = 1000, # Minimum number of reads (1000)
       percentTrimmed = 80,    # Minimum % of reads trimmed (80%)
       percentStitched = 80,   # Minimum % of reads stitched (80%)
       percentAligned = 75,    # Minimum % of reads aligned (80%)
       percentSaturation = 50, # Minimum sequencing saturation (50%)
       minNegativeCount = 1,   # Minimum negative control counts (10)
       maxNTCCount = 9000,     # Maximum counts observed in NTC well (1000)
       minNuclei = 20,         # Minimum # of nuclei estimated (100)
       minArea = 1000)         # Minimum segment area (5000)

Add QC results to the GeoMX data object

ThompsonData <-
  setSegmentQCFlags(ThompsonData, 
                    qcCutoffs = QC_params)        

# Collate QC Results
QCResults <- protocolData(ThompsonData)[["QCFlags"]]
flag_columns <- colnames(QCResults)
QC_Summary <- data.frame(Pass = colSums(!QCResults[, flag_columns]),
                         Warning = colSums(QCResults[, flag_columns]))
QCResults$QCStatus <- apply(QCResults, 1L, function(x) {
  ifelse(sum(x) == 0L, "PASS", "WARNING")
})
QC_Summary["TOTAL FLAGS", ] <-
  c(sum(QCResults[, "QCStatus"] == "PASS"),
    sum(QCResults[, "QCStatus"] == "WARNING"))

Function to make histograms for each QC metric

# Graphical summaries of QC statistics plot function
QC_histogram <- function(assay_data = NULL,
                         annotation = NULL,
                         fill_by = NULL,
                         thr = NULL,
                         scale_trans = NULL) {
  plt <- ggplot(assay_data,
                aes_string(x = paste0("unlist(`", annotation, "`)"),
                           fill = fill_by)) +
    geom_histogram(bins = 50) +
    geom_vline(xintercept = thr, lty = "dashed", color = "black") +
    theme_bw() + guides(fill = "none") +
    facet_wrap(as.formula(paste("~", fill_by)), nrow = 4) +
    labs(x = annotation, y = "Samples, #", title = annotation)
  if(!is.null(scale_trans)) {
    plt <- plt +
      scale_x_continuous(trans = scale_trans)
  }
  plt
}

Run QC metric histogram function to view QC results for each metric

#Explore QC metrics for the segments
col_by <- "segment"
QC_histogram(sData(ThompsonData), "Trimmed (%)", col_by, 80)

QC_histogram(sData(ThompsonData), "Stitched (%)", col_by, 80)

QC_histogram(sData(ThompsonData), "Aligned (%)", col_by, 75)

QC_histogram(sData(ThompsonData), "Saturated (%)", col_by, 50) +
  labs(title = "Sequencing Saturation (%)",
       x = "Sequencing Saturation (%)")

QC_histogram(sData(ThompsonData), "area", col_by, 1000, scale_trans = "log10")

#plot all of the QC Summary information in a table.
kable(QC_Summary, caption = "QC Summary Table for each Segment")
QC Summary Table for each Segment
Pass Warning
LowReads 83 0
LowTrimmed 83 0
LowStitched 83 0
LowAligned 83 0
LowSaturation 83 0
LowNegatives 83 0
HighNTC 83 0
LowArea 83 0
TOTAL FLAGS 83 0

Remove flagged samples based on sample-QC metrics

#Number of samples pre-QC filtering
dim(ThompsonData)

#Removal
ThompsonData <- ThompsonData[, QCResults$QCStatus == "PASS"]

#Number of samples post-QC filtering
dim(ThompsonData)

1.4 Per-probe quality Control

Set Probe QC Flags and view what probes passed or failed

# Generally keep the qcCutoffs parameters unchanged. Set removeLocalOutliers to 
# FALSE if you do not want to remove local outliers
ThompsonData <- setBioProbeQCFlags(ThompsonData, 
                               qcCutoffs = list(minProbeRatio = 0.1,
                                                percentFailGrubbs = 20), 
                               removeLocalOutliers = TRUE)

ProbeQCResults <- fData(ThompsonData)[["QCFlags"]]

# Define QC table for Probe QC
qc_df <- data.frame(Passed = sum(rowSums(ProbeQCResults[, -1]) == 0),
                    Global = sum(ProbeQCResults$GlobalGrubbsOutlier),
                    Local = sum(rowSums(ProbeQCResults[, -2:-1]) > 0
                                & !ProbeQCResults$GlobalGrubbsOutlier))
#View results
qc_df
##   Passed Global Local
## 1  18813      1     1

Exclude probes that did not pass QC and outlier probes

#Make an object that is a subset of probes that passed QC
ProbeQCPassed <- 
  subset(ThompsonData, 
         fData(ThompsonData)[["QCFlags"]][,c("LowProbeRatio")] == FALSE &
           fData(ThompsonData)[["QCFlags"]][,c("GlobalGrubbsOutlier")] == FALSE)
#View number of features pre-QC filtering
dim(ThompsonData)
## Features  Samples 
##    18815       83
#View number of features post-QC filtering
dim(ProbeQCPassed)
## Features  Samples 
##    18814       83
#If desired filtering is achieved, replace original object with filtered object
ThompsonData <- ProbeQCPassed 

1.5 Prepare GeoMX object for differential expression analysis with DESeq2

Create gene-level count data

# Check how many unique targets the object has
length(unique(featureData(ThompsonData)[["TargetName"]]))
## [1] 18677
# collapse to targets
target_ThompsonData <- aggregateCounts(ThompsonData)
dim(target_ThompsonData)
## Features  Samples 
##    18677       83
##Convert for DESeq
GeoMX_counts <- as.matrix(exprs(target_ThompsonData))
mode(GeoMX_counts) <- "integer"

Create annotation data formatted for DESeq2

##Annotation Data for DESeq2
my.coldata <- read.csv("CThompsonAnnotationsDESeq.csv",row.names = 1)

#Adjust annotation samples names to match counts rows names
colnames(GeoMX_counts) <- sub(".dcc", "", colnames(GeoMX_counts))

#Check that rownames match and are in the right order
all(rownames(my.coldata) %in% colnames(GeoMX_counts))
## [1] TRUE
all(rownames(my.coldata) == colnames(GeoMX_counts))
## [1] TRUE
#make annotation data factors
my.coldata$compartment <- factor(my.coldata$compartment)
my.coldata$slide_region <- factor(my.coldata$slide_region)
my.coldata$slide_name <- factor(my.coldata$slide_name)
my.coldata$segment <- factor(my.coldata$segment)
my.coldata$roi <- factor(my.coldata$roi)

Part 2: Differential gene expression with DESeq

2.1 Explore gene expression variation across all samples by slide region

Create a DESeq data object using the count data and annotation data and and set up the initial design

Run likelihood-ration test because there are more than two levels of “slide_region”

#The steps in part 2-A only use the counts data even though the test for DE is run here.
dds <- DESeq(dds, test="LRT",fitType="local",reduced=~1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 2 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
#Create a matrix of normalized counts in case it's needed (genes are rownames, samples are col names)
DESeq_counts <- counts(dds, normalized = TRUE)

Create two transformed counts for dimension reductions

dds_norm <- vst(dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
dds_rlog <- rlog(dds)
## rlog() may take a long time with 50 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.

Plot PCA for transformed count data

Use different annotation data for “intgroup” to color the samples based on group (the plot doesn’t change, just the color and legend)

plotPCA(dds_norm, intgroup = "compartment", ntop = 20, returnData = FALSE)

plotPCA(dds_norm, intgroup = c("slide_region","compartment"), ntop = 20, returnData = FALSE)

plotPCA(dds_norm, intgroup = "slide_region", ntop = 20, returnData = FALSE)

Try different values of ntop (number of top genes to use, selected by highest row variance. If we use higher numbers of genes for PCA, the top PC’s explain less variance

plotPCA(dds_norm, intgroup = "compartment", ntop = 50, returnData = FALSE)

plotPCA(dds_norm, intgroup = "compartment", ntop = 100, returnData = FALSE)

plotPCA(dds_norm, intgroup = "compartment", ntop = 500, returnData = FALSE)

UMAP Dimension Reduction on normalized counts

# First we are going to retrieve the normalized data
# from the `DESeqDataSet` object using the `assay()` function
normalized_counts <- as.data.frame(assay(dds_norm)) %>% t() # We need to transpose this data so each row is a sample
# Now perform UMAP on the normalized data
umap_results <- umap::umap(normalized_counts)
# Make into data frame for plotting with `ggplot2`
# The UMAP values we need for plotting are stored in the `layout` element
umap_plot_df <- data.frame(umap_results$layout)
#Add the annotation data
umap_plot_df <- merge(umap_plot_df,my.coldata,by=0)
ggplot(
  umap_plot_df,
  aes(
    x = X1,
    y = X2,
    color = compartment ,
    shape = slide_region
  )
) +
  geom_point(size=3)

Heatmap of correlation between samples (What samples group together based on gene count data?)

#A matrix of the the rlog counts
rld_mat <- assay(dds_rlog)

#Calculate correlations between samples' counts (all pair-wise comparisons)
rld_cor <- cor(rld_mat)    ## cor() is a base R function

#Add a column to coldata called "sample_type" that is region+compartment
my.coldata$sample_type <- paste(my.coldata$slide_region,my.coldata$compartment,my.coldata$segment)

Plot Heatmaps of sample correlation

#Label samples by sample_type
pheatmap(rld_cor, annotation = my.coldata %>% select(sample_type),show_rownames = FALSE,show_colnames = FALSE)

#Label samples by compartment
pheatmap(rld_cor, annotation = my.coldata %>% select(compartment),show_rownames = FALSE,show_colnames = FALSE)

#Label samples by slide_region
pheatmap(rld_cor, annotation = my.coldata %>% select(slide_region),show_rownames = FALSE,show_colnames = FALSE)

2.2 Differential gene expression between slide_region in sample subsets

2.2.1 First we’ll subset by compartment, starting with all endocrine samples

##Subset just the endocrine
#endocrine compartment samples from coldata
endo.coldata <- my.coldata %>% filter(compartment=="endocrine")
#Head region samples from count matrix
endo.counts <- as.data.frame(GeoMX_counts)
endo.counts <- endo.counts %>% select(rownames(endo.coldata))
endo.counts <- as.matrix(endo.counts)
mode(endo.counts) <- "integer"
is.factor(endo.coldata$compartment)
## [1] TRUE
all(rownames(endo.coldata) == colnames(endo.counts))
## [1] TRUE

Run the test for differential expression (LRT)

#Set up the initial model. The model is the effect of pancreas region on gene expression (for the endocrine compartment)
endo.dds <- DESeqDataSetFromMatrix(countData = endo.counts,
                              colData = endo.coldata,
                              design = ~ slide_region)
#LRT 
endo.dds <- DESeq(endo.dds, test="LRT",fitType="local", reduced=~1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 2 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
endo.dds_norm <- vst(endo.dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
endo.dds_rlog <- rlog(endo.dds)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.

Dimension reduction of this subset has the same underlying counts data as for all samples above, no need to re-run again, just here for clarity.

Plot the PCA of normalized counts for all endocrine samples colored by slide_region

#The number of top genes used for the PCA analysis has a big effect on the proportion of variance explained by Top PC's
plotPCA(endo.dds_norm, intgroup = "slide_region", ntop = 100, returnData = FALSE)

UMAP dimensional reduction of endocrine samples’ normalized counts

# First we are going to retrieve the normalized data
# from the `DESeqDataSet` object using the `assay()` function
endo.normalized_counts <- as.data.frame(assay(endo.dds_norm)) %>% t() # We need to transpose this data so each row is a sample
# Now perform UMAP on the normalized data
endo.umap_results <- umap::umap(endo.normalized_counts)
# Make into data frame for plotting with `ggplot2`
# The UMAP values we need for plotting are stored in the `layout` element
endo.umap_plot_df <- data.frame(endo.umap_results$layout)
#Add the annotation data
endo.umap_plot_df <- merge(endo.umap_plot_df,endo.coldata,by=0)
ggplot(
  endo.umap_plot_df,
  aes(
    x = X1,
    y = X2,
    color = slide_region
  )
) +
  geom_point()

Heatmap of correlation between endocrine samples suggests body roi share expression profile

#Create a matrix of the rlog transformed gene counts
endo.rld_mat <- assay(endo.dds_rlog) 

#Calculate pair-wise correlation between samples' counts
endo.rld_cor <- cor(endo.rld_mat)    

#Add a column to coldata that is region+compartment for labeling
endo.coldata$sample_type <- paste(endo.coldata$slide_region,endo.coldata$compartment,endo.coldata$segment)

### Plot heatmap
pheatmap(endo.rld_cor,annotation = endo.coldata %>% select(slide_region),show_rownames = FALSE,show_colnames = FALSE)

Get results of DE testing comparing each level of slide_region (tail, head, neck) to the body slide_region

#Get results table for contrasts of interest (the second condition is the control)
res.head.vs.body <- results(endo.dds, contrast=c("slide_region","head","body"))
res.tail.vs.body <- results(endo.dds, contrast=c("slide_region","tail","body"))
res.neck.vs.body <- results(endo.dds, contrast=c("slide_region","neck","body"))
#Make a total results table where there is a test statistics for each contrast (the pvalue will be the same)
endo.results1 <- as.data.frame(res.neck.vs.body)
names(endo.results1)[names(endo.results1) == 'log2FoldChange'] <- 'log2FoldChange.neck.vs.body'
endo.results1 <- endo.results1 %>% select (!lfcSE)
endo.results2 <- as.data.frame(res.head.vs.body)
names(endo.results2)[names(endo.results2) == 'log2FoldChange'] <- 'log2FoldChange.head.vs.body'
endo.results2 <- endo.results2 %>% select (log2FoldChange.head.vs.body)
endo.results3 <- as.data.frame(res.tail.vs.body)
names(endo.results3)[names(endo.results3) == 'log2FoldChange'] <- 'log2FoldChange.tail.vs.body'
endo.results3 <- endo.results3 %>% select (log2FoldChange.tail.vs.body)
endo.results <- merge(endo.results1,endo.results2,by=0)
rownames(endo.results) <- endo.results[,1]
endo.results <- endo.results[,-1]
endo.results <- merge(endo.results,endo.results3,by=0)
rownames(endo.results) <- endo.results[,1]
endo.results <- endo.results[,-1]
endo.results <- endo.results[,order(colnames(endo.results))]
endo.results$Gene <- rownames(endo.results)

Visualize expression of DE genes in three pancreas regions relative to expression in the body of the pancreas

for (i in 2:4){
temp <- endo.results[,c(1,i,5,6,7,8)]
temp$Color <- NULL
temp$Color[temp$pvalue < 0.05] <- "P < 0.05"
temp$Color[temp$padj < 0.05] <- "padj < 0.05"
temp$Color[temp$padj < 0.001] <- "padj < 0.001"
temp$Color[abs(temp[,2]) < 0.5] <- "FC < 0.5"
temp$Color <- factor(temp$Color,levels = c("FC < 0.5", "P < 0.05","padj < 0.05", "padj < 0.001"))

#pick top genes for either side of volcano to label
#subset by positive FC, sort by FDR, and pick top 15
negFC_top_g <- temp %>% filter(temp[,2] > 0.5) %>% arrange(padj) %>% slice_head(n=15) %>% select(Gene)
assign(x = paste0(i,"negFC_top_g"),negFC_top_g)
posFC_top_g <- temp %>% filter(temp[,2] < -0.5) %>% arrange(padj) %>% slice_head(n=15) %>% select(Gene)
assign(x = paste0(i,"posFC_top_g"),posFC_top_g)
top_g <- c(posFC_top_g,negFC_top_g)
top_g <- unlist(top_g, recursive = FALSE)
#Subset the results df just to the top genes for plot labeling
#topgene.df <- temp %>% filter(Gene)
# Graph results
p <- ggplot(temp,
            aes(x = temp[,2], y = -log10(pvalue),
                color = Color, label = Gene)) +
  geom_vline(xintercept = c(0.5, -0.5), lty = "dashed") +
  geom_hline(yintercept = -log10(0.05), lty = "dashed") +
  geom_point() +
  labs(x = paste0("Enriched in ",str_split_fixed(names(temp)[2],"\\.",4)[,4]," <- log2(FC) -> Enriched in ",str_split_fixed(names(temp)[2],"\\.",4)[,2],sep=" "),
       y = "Significance, -log10(P)",
       color = "Significance") +
  scale_color_manual(values = c(`padj < 0.001` = "dodgerblue",
                                `padj < 0.05` = "lightblue",
                                `P < 0.05` = "orange2",
                                `FC < 0.5` = "gray"),
                     guide = guide_legend(override.aes = list(size = 4))) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05)))

p2 <- p+geom_text_repel(data = subset(temp, Gene %in% top_g & padj < 0.05),aes(x = subset(temp, Gene %in% top_g & padj < 0.05)[,2], y = -log10(pvalue)),
                  size = 4, point.padding = 0.15, color = "black",
                  min.segment.length = .1, box.padding = .2, lwd = 2,
                  max.overlaps = 50) +
  theme_bw(base_size = 16) +
  theme(legend.position = "bottom")
#assign(paste0("plot", i), p2)
print(p2)
}
## Warning: Duplicated aesthetics after name standardisation: size
## Duplicated aesthetics after name standardisation: size

## Warning: Duplicated aesthetics after name standardisation: size

Look at the expression of genes identified as DE in the head region relative to other regions

#Make a list of DE genes that are upregulated in the body relative to all three other regions
#Combine the three upreg lists
up.body = as.data.frame(table(do.call(what = rbind, args =mget(ls(pattern = ".pos")))))
Up.Body_genes <- up.body %>% filter (Freq==3) %>% select(Var1)
down.body = as.data.frame(table(do.call(what = rbind, args =mget(ls(pattern = ".neg")))))
Down.Body_genes <- down.body %>% filter (Freq==3) %>% select(Var1)

#Violin plots of genes that are enriched in body relative to each other region
endo.norm.counts.df <- as.data.frame(t(endo.rld_mat))

#Add the annotation data
endo.norm.counts.df <- merge(endo.norm.counts.df,endo.coldata,by=0)

#violin plot
violin <- function(gene){
ggplot(endo.norm.counts.df[c(gene,"sample_type")],
       aes(x = sample_type, fill = sample_type,
           y = endo.norm.counts.df[c(gene,"sample_type")][,1]))+
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = paste(gene,"Expression (normalized counts)")) +
  theme_bw()
}
lapply(as.list(as.character(Down.Body_genes$Var1)),violin)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

lapply(as.list(as.character(Up.Body_genes$Var1)),violin)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

2.2.2 Now we begin to repeat the steps above for the exocrine compartment samples

##Subset just the exocrine
#exocrine compartment samples from coldata
exo.coldata <- my.coldata %>% filter(compartment=="exocrine")
#Head region samples from count matrix
exo.counts <- as.data.frame(GeoMX_counts)
exo.counts <- exo.counts %>% select(rownames(exo.coldata))
exo.counts <- as.matrix(exo.counts)
mode(exo.counts) <- "integer"
is.factor(exo.coldata$compartment)
## [1] TRUE
all(rownames(exo.coldata) == colnames(exo.counts))
## [1] TRUE
#Set up the initial model. The model is the effect of pancreas region on gene expression (for the exocrine compartment)
exo.dds <- DESeqDataSetFromMatrix(countData = exo.counts,
                              colData = exo.coldata,
                              design = ~ slide_region)
#LRT 
exo.dds <- DESeq(exo.dds, test="LRT",fitType="local", reduced=~1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
exo.dds_norm <- vst(exo.dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
exo.dds_rlog <- rlog(exo.dds)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.

PCA of normalized counts (exocrine)

#The number of top genes used for the PCA analysis has a big effect on the proportion of variance explained by Top PC's
plotPCA(exo.dds_norm, intgroup = "slide_region", ntop = 20, returnData = FALSE)

#plotPCA(exo.dds_norm, intgroup = "slide_region", ntop = 100, returnData = FALSE)
#plotPCA(exo.dds_norm, intgroup = "slide_region", ntop = 200, returnData = FALSE)
#plotPCA(exo.dds_norm, intgroup = "slide_region", ntop = 500, returnData = FALSE)

Dimension Reduction (exocrine)

# First we are going to retrieve the normalized data
# from the `DESeqDataSet` object using the `assay()` function
exo.normalized_counts <- as.data.frame(assay(exo.dds_norm)) %>% t() # We need to transpose this data so each row is a sample
# Now perform UMAP on the normalized data
exo.umap_results <- umap::umap(exo.normalized_counts)
# Make into data frame for plotting with `ggplot2`
# The UMAP values we need for plotting are stored in the `layout` element
exo.umap_plot_df <- data.frame(exo.umap_results$layout)
#Add the annotation data
exo.umap_plot_df <- merge(exo.umap_plot_df,exo.coldata,by=0)
ggplot(
  exo.umap_plot_df,
  aes(
    x = X1,
    y = X2,
    color = slide_region
  )
) +
  geom_point()

Heatmap of correlation between samples (exocrine) No pancreas region really shares an expression profile for the exocrine compartment roi’s

exo.rld_mat <- assay(exo.dds_rlog) 

exo.rld_cor <- cor(exo.rld_mat)    

#Add a column to coldata that is region+compartment
exo.coldata$sample_type <- paste(exo.coldata$slide_region,exo.coldata$compartment,exo.coldata$segment)

### Plot heatmap
pheatmap(exo.rld_cor,annotation = exo.coldata %>% select(slide_region),show_rownames = FALSE,show_colnames = FALSE)

No group of roi are distinct based on region, so the DE analysis below isn’t run until we decide what the groups of interest are.

#Get results table for contrasts of interest (the second condition is the control)
res.head.vs.body <- results(exo.dds, contrast=c("slide_region","head","body"))
res.tail.vs.body <- results(exo.dds, contrast=c("slide_region","tail","body"))
res.neck.vs.body <- results(exo.dds, contrast=c("slide_region","neck","body"))

#Make a total results table where there is a test statistics for each contrast (the pvalue will be the same)
exo.results1 <- as.data.frame(res.neck.vs.body)
names(exo.results1)[names(exo.results1) == 'log2FoldChange'] <- 'log2FoldChange.neck.vs.body'
exo.results1 <- exo.results1 %>% select (!lfcSE)
exo.results2 <- as.data.frame(res.head.vs.body)
names(exo.results2)[names(exo.results2) == 'log2FoldChange'] <- 'log2FoldChange.head.vs.body'
exo.results2 <- exo.results2 %>% select (log2FoldChange.head.vs.body)
exo.results3 <- as.data.frame(res.tail.vs.body)
names(exo.results3)[names(exo.results3) == 'log2FoldChange'] <- 'log2FoldChange.tail.vs.body'
exo.results3 <- exo.results3 %>% select (log2FoldChange.tail.vs.body)
exo.results <- merge(exo.results1,exo.results2,by=0)
rownames(exo.results) <- exo.results[,1]
exo.results <- exo.results[,-1]
exo.results <- merge(exo.results,exo.results3,by=0)
rownames(exo.results) <- exo.results[,1]
exo.results <- exo.results[,-1]
exo.results <- exo.results[,order(colnames(exo.results))]
exo.results$Gene <- rownames(exo.results)

Visualize expression of DE genes in three pancreas regions relative to expression in the body of the pancreas

##Volcano Plot
##Visualizing DE Genes
#Volcano Plots
#A canonical visualization for interpreting differential gene expression results is the volcano plot. 
library(tidyverse)
library(dplyr)
library(ggrepel) 
library(stringr)

for (i in 2:4){
temp <- exo.results[,c(1,i,5,6,7,8)]
temp$Color <- NULL
temp$Color[temp$pvalue < 0.05] <- "P < 0.05"
temp$Color[temp$padj < 0.05] <- "padj < 0.05"
temp$Color[temp$padj < 0.001] <- "padj < 0.001"
temp$Color[abs(temp[,2]) < 0.5] <- "FC < 0.5"
temp$Color <- factor(temp$Color,levels = c("FC < 0.5", "P < 0.05","padj < 0.05", "padj < 0.001"))

#pick top genes for either side of volcano to label
#subset by positive FC, sort by FDR, and pick top 15
exo.negFC_top_g <- temp %>% filter(temp[,2] > 0.5) %>% arrange(padj) %>% slice_head(n=15) %>% select(Gene)
assign(x = paste0(i,"negFC_top_g"),exo.negFC_top_g)
exo.posFC_top_g <- temp %>% filter(temp[,2] < -0.5) %>% arrange(padj) %>% slice_head(n=15) %>% select(Gene)
assign(x = paste0(i,"posFC_top_g"),exo.posFC_top_g)
exo.top_g <- c(exo.posFC_top_g,exo.negFC_top_g)
exo.top_g <- unlist(exo.top_g, recursive = FALSE)
#Subset the results df just to the top genes for plot labeling
#topgene.df <- temp %>% filter(Gene)
# Graph results
p <- ggplot(temp,
            aes(x = temp[,2], y = -log10(pvalue),
                color = Color, label = Gene)) +
  geom_vline(xintercept = c(0.5, -0.5), lty = "dashed") +
  geom_hline(yintercept = -log10(0.05), lty = "dashed") +
  geom_point() +
  labs(x = paste0("Enriched in ",str_split_fixed(names(temp)[2],"\\.",4)[,4]," <- log2(FC) -> Enriched in ",str_split_fixed(names(temp)[2],"\\.",4)[,2],sep=" "),
       y = "Significance, -log10(P)",
       color = "Significance") +
  scale_color_manual(values = c(`padj < 0.001` = "dodgerblue",
                                `padj < 0.05` = "lightblue",
                                `P < 0.05` = "orange2",
                                `FC < 0.5` = "gray"),
                     guide = guide_legend(override.aes = list(size = 4))) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05)))

p2 <- p+geom_text_repel(data = subset(temp, Gene %in% exo.top_g & padj < 0.05),aes(x = subset(temp, Gene %in% exo.top_g & padj < 0.05)[,2], y = -log10(pvalue)),
                  size = 4, point.padding = 0.15, color = "black",
                  min.segment.length = .1, box.padding = .2, lwd = 2,
                  max.overlaps = 50) +
  theme_bw(base_size = 16) +
  theme(legend.position = "bottom")
#assign(paste0("plot", i), p2)
print(p2)
}

Look at the expression of genes identified as DE in the head region relative to other regions There are no significant results to plot (unlike for the endocrine compartment)

#Make a list of DE genes that are upregulated in the body relative to all three other regions
#Combine the three upreg lists
exo.up.body = as.data.frame(table(do.call(what = rbind, args =mget(ls(pattern = "^exo.pos")))))
exo.Up.Body_genes <-exo.up.body %>% filter (Freq==3) %>% select(Var1)
exo.down.body = as.data.frame(table(do.call(what = rbind, args =mget(ls(pattern = "^exo.neg")))))
exo.Down.Body_genes <- exo.down.body %>% filter (Freq==3) %>% select(Var1)
#Violin plots of genes that are enriched in body relative to each other region
exo.norm.counts.df <- as.data.frame(t(exo.rld_mat))
#Add the annotation data
exo.norm.counts.df <- merge(exo.norm.counts.df,exo.coldata,by=0)
#violin plot
exo.violin <- function(gene){
ggplot(exo.norm.counts.df[c(gene,"sample_type")],
       aes(x = sample_type, fill = sample_type,
           y = exo.norm.counts.df[c(gene,"sample_type")][,1]))+
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = paste(gene,"Expression (normalized counts)")) +
  theme_bw()
}
lapply(as.list(as.character(exo.Down.Body_genes$Var1)),exo.violin)
lapply(as.list(as.character(exo.Up.Body_genes$Var1)),exo.violin)

2.2.3 Repeat differential expression analysis for subset of Islet samples

Subset the islet segment samples

#islet segment samples from coldata
islet.coldata <- my.coldata %>% filter(segment=="CD31-Islet")

#islet segment samples from count matrix
islet.counts <- as.data.frame(GeoMX_counts)
islet.counts <- islet.counts %>% select(rownames(islet.coldata))
islet.counts <- as.matrix(islet.counts)
mode(islet.counts) <- "integer"
is.factor(islet.coldata$roi)
## [1] TRUE
all(rownames(islet.coldata) == colnames(islet.counts))
## [1] TRUE

Set up the initial DE test model

#The model is the effect of pancreas region on gene expression (for the islet compartment)
islet.dds <- DESeqDataSetFromMatrix(countData = islet.counts,
                              colData = islet.coldata,
                              design = ~ slide_region)
#LRT 
islet.dds <- DESeq(islet.dds, test="LRT",fitType="local", reduced=~1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Transform the counts data

islet.dds_norm <- vst(islet.dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
islet.dds_rlog <- rlog(islet.dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.

PCA of normalized counts (islet)

#The number of top genes used for the PCA analysis has a big effect on the proportion of variance explained by Top PC's
plotPCA(islet.dds_norm, intgroup = "slide_region", ntop = 20, returnData = FALSE)

#plotPCA(islet.dds_norm, intgroup = "slide_region", ntop = 100, returnData = FALSE)
#plotPCA(islet.dds_norm, intgroup = "slide_region", ntop = 200, returnData = FALSE)
#plotPCA(islet.dds_norm, intgroup = "slide_region", ntop = 500, returnData = FALSE)

Dimension Reduction (islet)

# First we are going to retrieve the normalized data
# from the `DESeqDataSet` object using the `assay()` function
islet.normalized_counts <- as.data.frame(assay(islet.dds_norm)) %>% t() # We need to transpose this data so each row is a sample
# Now perform UMAP on the normalized data
islet.umap_results <- umap::umap(islet.normalized_counts)
# Make into data frame for plotting with `ggplot2`
# The UMAP values we need for plotting are stored in the `layout` element
islet.umap_plot_df <- data.frame(islet.umap_results$layout)
#Add the annotation data
islet.umap_plot_df <- merge(islet.umap_plot_df,islet.coldata,by=0)
ggplot(
  islet.umap_plot_df,
  aes(
    x = X1,
    y = X2,
    color = slide_region
  )
) +
  geom_point()

Heat-map of correlation between islet samples

islet.rld_mat <- assay(islet.dds_rlog) 

islet.rld_cor <- cor(islet.rld_mat)

#Add a column to coldata that is region+compartment
islet.coldata$sample_type <- paste(islet.coldata$slide_region,islet.coldata$compartment,islet.coldata$segment)

#Plot heatmap
pheatmap(islet.rld_cor,annotation = islet.coldata %>% select(slide_region),show_rownames = FALSE,show_colnames = FALSE)

Because no pancreas region really shares an expression profile for the islet compartment roi’s, instead of moving onto looking at DE results we can look at variance per gene within the islet samples.

2.3 Analysis of variance per gene within a group of samples

Calculate the variance per gene and subset the top ten most variable genes

# calculate variance per gene
islet.rv <- as.data.frame(rowVars(assay(islet.dds_rlog)))
#Add the gene names
rownames(islet.rv) <- rownames(assay(islet.dds_rlog))
#Get top ten
topten.islet.rv <- islet.rv %>% arrange(desc(islet.rv$`rowVars(assay(islet.dds_rlog))`)) %>% head(10)

Heat-map of normalized gene counts across islet samples for the top 10 most within-islet-samples variable genes

dataframe <- as.data.frame(assay(islet.dds_rlog))
data_subset <- dataframe[rownames(dataframe) %in% rownames(topten.islet.rv), ]
pheatmap(data_subset,annotation = islet.coldata %>% select(slide_region,segment),show_rownames = TRUE,show_colnames = FALSE)